Comparing two datasets with Venn Diagram

Author

Cordeliers Artificial Intelligence and Bioinformatics

Published

May 9, 2025

Here is an example of an application for the function plot_venn. For this script, we used 2 different public datasets airway and macrophage.

The airway dataset provides a gene expression dataset derived from human bronchial epithelial cells, treated or not with dexamethasone (a corticosteroid).

The macrophage dataset provides bulk RNA-seq count data from human monocyte-derived macrophages. These cells were either left untreated or stimulated with interferon-gamma (IFN-γ). An experimental condition was applied to each of these cells, left unstimulated, stimulated with IFN-gamma, infected with SL1344, or pre-treated with IFN-g and then infected with SL1344. For this comparison, we will focus on the samples of the 2 first conditions (IFN-g, naive).

To compare the datasets, we will do the pre-processing part of the sup and unsup analysis, until the diffexp.

annotation_air <- "dex"
annotation_mac <- "condition_name"

diffexpMethod <- "limma"

Import libraries

library(airway)
library(SummarizedExperiment)
library(macrophage)
library(biomaRt)
library(CAIBIrnaseq)

Load datasets

#Loading the 2 datasets
data(airway, package="airway"); 
airway <- airway

data("gse")
macrophage <- gse; rm(gse)
macrophage <- macrophage[,macrophage$condition %in% c("naive", "IFNg")] # focus on these 2 conditions

Even if the datasets are globally build the same way, the names of the variables are not exactly the same, so if we want to keep the same code, we need to redefine a bit the variables.

If you want to know what are the used variables in this part, run this command line : colnames(SummarizedExperiment::rowData(exp_data)) You should have these variables (with these exact same names): - gene_name - gene_id - gene_length_kb - gene_description - gene_biotype

If not, you should look at how your dataset is defined. You might need to run some command line as the following ones:

rowData(airway)$gene_length_kb <-(rowData(airway)$gene_seq_end - rowData(airway)$gene_seq_start) / 1000

mart_air <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

gene_ids_air <- rowData(airway)$gene_id

annot_air <- getBM(attributes = c("ensembl_gene_id", "description"),
               filters = "ensembl_gene_id",
               values = gene_ids_air,
               mart = mart_air)

matched_air <- match(rowData(airway)$gene_id, annot_air$ensembl_gene_id)
rowData(airway)$gene_description <- annot_air$description[matched_air]
rowData(macrophage)$gene_length_kb <- width(rowRanges(macrophage)) / 1000
rownames(macrophage) <- sub("\\..*$", "", rowData(macrophage)$gene_id)
rowData(macrophage)$gene_id <- sub("\\..*$", "", rowData(macrophage)$gene_id)

rowData(macrophage)$gene_name <- rowData(macrophage)$SYMBOL

macrophage <- macrophage[!is.na(rowData(macrophage)$SYMBOL), ]

mart_mac <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

annot_mac <- getBM(
  attributes = c("ensembl_gene_id", "gene_biotype", "description"),
  filters = "ensembl_gene_id",
  values = unique(rowData(macrophage)$gene_id),
  mart = mart_mac
)
# Correspondance entre gene_id et annot_mac
match_mac <- match(rowData(macrophage)$gene_id, annot_mac$ensembl_gene_id)

# Ajouter les colonnes dans rowData
rowData(macrophage)$gene_biotype <- annot_mac$gene_biotype[match_mac]
rowData(macrophage)$gene_description <- annot_mac$description[match_mac]

Pre-processing

airway <- rebase_gexp(airway, annotation = "gene_name")
macrophage <- rebase_gexp(macrophage, annotation = "SYMBOL") # or gene_name if you renamed the variable 

Filter

airway <- filter_gexp(airway,
                        min_nsamp = 1, 
                        min_counts = 1)
macrophage <- filter_gexp(macrophage,
                        min_nsamp = 1, 
                        min_counts = 1)

Quality Control

colData(airway)$sample_id <- colnames(airway)
plot_qc_filters(airway)
Saving 7 x 5 in image
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues
colData(macrophage)$sample_id <- colnames(macrophage)
plot_qc_filters(macrophage)
Saving 7 x 5 in image
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
geom_GeomTextRepel() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Normalize

airway <- normalize_gexp(airway)
assay(macrophage) <- round(assay(macrophage))
macrophage <- normalize_gexp(macrophage)

PCA

#|message: false
metadata(airway)[["pca_res"]] <- pca_gexp(airway)
annotations <- setdiff(annotation_air, c("exp_cluster", "path_cluster"))
plot_pca(airway, color = annotation_air)
Saving 7 x 5 in image
metadata(macrophage)[["pca_res"]] <- pca_gexp(macrophage)
annotations <- setdiff(annotation_mac, c("exp_cluster", "path_cluster"))
plot_pca(macrophage, color = annotation_mac)
Saving 7 x 5 in image

Diffexp

library(edgeR)
colData(airway)$dex <- factor(colData(airway)$dex)
colData(airway)$dex <- factor(colData(airway)$dex, levels = c("untrt", "trt"))


diffexp_air <- diffExpAnalysis(countData = assays(airway)$counts,
                                          sampleInfo = colData(airway),
                                          method = diffexpMethod, cutoff = 10,
                                          annotation = annotation_air)

colData(macrophage)$condition_name <- factor(colData(macrophage)$condition_name)
colData(macrophage)$condition_name <- factor(colData(macrophage)$condition_name, levels = c("IFNg", "naive"))


diffexp_mac <- diffExpAnalysis(countData = assays(macrophage)$counts,
                                          sampleInfo = colData(macrophage),
                                          method = diffexpMethod, cutoff = 10,
                                          annotation = annotation_mac)

Venn Diagram

#|message: false
if(tolower(diffexpMethod) == "limma") {
  diffexp_air <- diffexp_air |>
    dplyr::rename(
      log2FoldChange = logFC,
      pvalue = P.Value,
      padj = adj.P.Val
    )
}

if(tolower(diffexpMethod) == "limma"){
  diffexp_mac <- diffexp_mac |>
    dplyr::rename(
      log2FoldChange = logFC,
      pvalue = P.Value,
      padj = adj.P.Val
    )
}

# Exemple : extraction des gènes DE significatifs
degs_air <- rownames(diffexp_air[diffexp_air$padj < 0.05, ])
degs_mac <- rownames(diffexp_mac[diffexp_mac$padj < 0.05, ])
universe_size <- length(union(degs_air, degs_mac))

# Diagramme de Venn
plot_venn(degs_air, degs_mac, universe_size,
          v1_name = "Airway", v2_name = "Macrophage",
          fills = c("lightgreen", "skyblue"),
          title = "DEGs communs et spécifiques")

inter <- intersect(degs_air, degs_mac)

msigdbr::msigdbr_collections() |>
  kableExtra::kbl() |>
  kableExtra::kable_styling() |>
  kableExtra::scroll_box(height = "300px")
gs_collection gs_subcollection gs_collection_name num_genesets
C1 Positional 302
C2 CGP Chemical and Genetic Perturbations 3494
C2 CP Canonical Pathways 19
C2 CP:BIOCARTA BioCarta Pathways 292
C2 CP:KEGG_LEGACY KEGG Legacy Pathways 186
C2 CP:KEGG_MEDICUS KEGG Medicus Pathways 658
C2 CP:PID PID Pathways 196
C2 CP:REACTOME Reactome Pathways 1736
C2 CP:WIKIPATHWAYS WikiPathways 830
C3 MIR:MIRDB miRDB 2377
C3 MIR:MIR_LEGACY MIR_Legacy 221
C3 TFT:GTRD GTRD 505
C3 TFT:TFT_LEGACY TFT_Legacy 610
C4 3CA Curated Cancer Cell Atlas gene sets 148
C4 CGN Cancer Gene Neighborhoods 427
C4 CM Cancer Modules 431
C5 GO:BP GO Biological Process 7608
C5 GO:CC GO Cellular Component 1026
C5 GO:MF GO Molecular Function 1820
C5 HPO Human Phenotype Ontology 5653
C6 Oncogenic Signature 189
C7 IMMUNESIGDB ImmuneSigDB 4872
C7 VAX HIPC Vaccine Response 347
C8 Cell Type Signature 840
H Hallmark 50
pathway_collections <- c("CP:KEGG_MEDICUS")
pathways <- get_annotation_collection(pathway_collections, 
                                      species = "human")
-- Collecting CP:KEGG_MEDICUS from MSigDB...
dir.create(file.path("results", "airway", "pathways"), recursive = TRUE, showWarnings = FALSE)

# airway
pathway_scores <- score_pathways(airway, pathways, verbose = FALSE)
! Duplicated gene IDs removed from gene set KEGG_MEDICUS_PATHOGEN_HTLV_1_TAX_TO_NFY_MEDIATED_TRANSCRIPTION
! Duplicated gene IDs removed from gene set KEGG_MEDICUS_REFERENCE_ANTIGEN_PROCESSING_AND_PRESENTATION_BY_MHC_CLASS_II_MOLECULES
metadata(airway)[["pathway_scores"]] <- pathway_scores

collections <- pathway_collections |> 
  paste(collapse = "_") |>
  stringr::str_remove("\\:")

plot_pathway_heatmap(airway, annotations = annotation_air,
                    fwidth = 9,
                    fname = stringr::str_glue(
                      "results/airway/pathways/hm_paths_{collections}_top20.pdf")
                    )
Warning in prep_scores_hm(exp_data, pathway_scores, pathways): 'sample_id'
already exists in colData and will be overwritten.
-- Saving heatmap at results/airway/pathways/hm_paths_CPKEGG_MEDICUS_top20.pdf

write.csv(pathways, file = stringr::str_glue("results/airway/pathways/paths_{collections}.csv"))